syntax = "proto3";

package tensorflow.profiler;

// Types of memory bandwidth we track in the system.
enum MemBwType {
  // Aliasing is required because the FIRST/MAX sentinel values below share
  // their numbers with real memory-bandwidth types.
  option allow_alias = true;

  // We use the FIRST and MAX sentinel values to be able to iterate over this
  // enum in TypeScript, since the _MIN and _MAX values are not automatically
  // available as in C++.
  // Alias of the smallest value (MEM_BW_TYPE_HBM_RW); 0 is also the proto3
  // default.
  MEM_BW_TYPE_FIRST = 0;
  // Aggregated BW across on-chip and off-chip memory.
  // NOTE(review): the name suggests HBM read+write bandwidth rather than an
  // aggregate over all memories -- confirm which one is intended.
  MEM_BW_TYPE_HBM_RW = 0;
  // On-chip memory read bw.
  MEM_BW_TYPE_SRAM_RD = 1;
  // On-chip memory write bw.
  MEM_BW_TYPE_SRAM_WR = 2;

  // Alias of the largest value (currently MEM_BW_TYPE_SRAM_WR).
  // Leave last; it should keep matching the largest value when types are
  // added.
  MEM_BW_TYPE_MAX = 2;
}

// Tensorflow generic memory space names.
enum MemorySpace {
  // Default (proto3 zero) value: no memory space specified.
  MEMORY_SPACE_UNDEFINED = 0;
  // Off-chip memory.
  // Assume all backends use 1 for HBM/off-chip memory.
  MEMORY_SPACE_HBM = 1;
  // On-chip memory. The value is 2147483646 (INT32_MAX - 1).
  // NOTE(review): presumably chosen at the top of the int32 range to stay
  // clear of device-specific memory space ids -- confirm.
  MEMORY_SPACE_ON_CHIP = 0x7FFFFFFE;
  // Any memory. The value is 2147483647 (INT32_MAX), the largest number a
  // protobuf enum value can take.
  MEMORY_SPACE_ALL = 0x7FFFFFFF;
}

// What the dimension represents, e.g. spatial, feature or batch.
enum LayoutDimensionSemantics {
  // Default (proto3 zero) value: the meaning of the dimension is not known.
  UNKNOWN_SEMANTICS = 0;
  // The dimension is a feature dimension.
  FEATURE = 1;
  // The dimension is a batch dimension.
  BATCH = 2;
  // The dimension is a spatial dimension.
  SPATIAL = 3;
}

// Data layout of an op.
message LayoutAnalysis {
  // Physical data layout in each tensor dimension.
  message Dimension {
    // Size of the data in this dimension.
    // NOTE(review): the unit (elements vs. bytes) is not stated here --
    // confirm against the producer of this message.
    int32 size = 1;
    // Data must be padded to a multiple of alignment.
    // NOTE(review): presumably expressed in the same unit as `size` --
    // confirm.
    int32 alignment = 2;
    // What the dimension represents (feature, batch, spatial, or unknown).
    LayoutDimensionSemantics semantics = 3;
  }
  // The physical data layout, from most-minor to most-major dimensions.
  // One entry per tensor dimension.
  repeated Dimension dimensions = 1;
}

// A container message wrapping a repeated OpMetrics.MemoryAccessed field so
// that it can be serialized on its own in "symbolized xplane."
message MemoryAccessBreakdown {
  // Memory-access entries; each one carries an operation type, a memory
  // space id and a byte count (see OpMetrics.MemoryAccessed).
  repeated OpMetrics.MemoryAccessed memory_accessed = 1;
}

// Metrics for an operation (accumulated over all occurrences).
// Next ID: 24
message OpMetrics {
  // Reserved field numbers; they are not available for new fields.
  reserved 4, 8 to 9;

  // One entry of the memory-access breakdown: the number of bytes accessed
  // for a given operation type in a given memory space.
  message MemoryAccessed {
    // Kind of memory operation an entry accounts for.
    enum OperationType {
      UNKNOWN = 0;
      READ = 1;
      WRITE = 2;
    }
    // Operation type this entry accounts for.
    OperationType operation_type = 1;
    // Memory space id; the numbering is device-specific.
    uint64 memory_space = 2;
    // Bytes accessed for this operation type and memory space.
    uint64 bytes_accessed = 3;
  }

  // Id of the HLO module this op belongs to; TF ops use 0.
  uint64 hlo_module_id = 13;
  // Op name.
  string name = 6;
  // Long form of the op name (e.g., the HLO expression).
  string long_name = 20;
  // Op category.
  string category = 11;
  // Where the op comes from (e.g., for an HLO op, the original TF op).
  string provenance = 12;
  // True if the op is executed eagerly.
  bool is_eager = 18;
  // How many times the op was executed.
  uint32 occurrences = 3;
  // Total time in picoseconds, including children (self + children).
  uint64 time_ps = 7;
  // Smallest time (self + children) observed among all occurrences.
  uint64 min_time_ps = 17;
  // Total self time in picoseconds.
  uint64 self_time_ps = 1;
  // Total number of FLOPs.
  uint64 flops = 2;
  // Total number of bytes accessed.
  uint64 bytes_accessed = 5;
  // Memory accessed, broken down by operation type and memory space.
  repeated MemoryAccessed memory_accessed_breakdown = 19;
  // Total DMA stall time in picoseconds.
  uint64 dma_stall_ps = 10;
  // Data layout of the op. For now it is only set for convolution ops.
  LayoutAnalysis layout = 14;
  // Deduplicated HLO name of the op. Left unset for TF ops.
  string deduplicated_name = 15;
  // Child ops, e.g. the fused ops when this op is a fusion. OpMetricsDb holds
  // OpMetrics in turn, so this structure nests recursively.
  OpMetricsDb children = 16;
  // Number of cores on which this op occurs.
  uint32 num_cores = 21;
  // Size, in BITS, of the computation primitive, i.e. of the type used by the
  // hardware computation. This may later be extended with information such as
  // signed/unsigned or int/fp; for now only the size is needed.
  uint32 computation_primitive_size = 22;
  // True if the op is autotuned.
  bool autotuned = 23;
}

// Statistics about the various precisions used in computation.
message PrecisionStats {
  // Amount of time spent on 16-bit computation, in picoseconds (ps).
  uint64 compute_16bit_ps = 1;
  // Amount of time spent on 32-bit computation, in picoseconds (ps).
  uint64 compute_32bit_ps = 2;
}

// A database for OpMetrics.
// Next ID: 14
message OpMetricsDb {
  // Reserved field numbers; they are not available for new fields.
  reserved 1, 4 to 9;

  // The collection of per-op metrics stored in this database.
  repeated OpMetrics metrics_db = 10;
  // Total duration of host infeed-enqueues, in picoseconds.
  uint64 total_host_infeed_enq_duration_ps = 2;
  // Sum, in picoseconds, of the differences between the start times of two
  // consecutive infeed-enqueues (per host).
  uint64 total_host_infeed_enq_start_timestamp_ps_diff = 3;
  // Total time, in picoseconds.
  uint64 total_time_ps = 11;
  // Total time incurred by OPs, in picoseconds.
  uint64 total_op_time_ps = 12;
  // Stats related to computation precision.
  PrecisionStats precision_stats = 13;
}
